1. Which data bases dplyr can work with? Five commmon ones: RMySQL, RPostgreSQL, SQLite database, odbc and bigrquery.

  2. What is bigrquery and how to work with it

#library(bigrquery)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(stringr)

Chicago taxi data

Data description: This is data from taxi trips reported to the city of Chicago. There are total 112860054 observations and 23 variables containing information about the taxi, time, location and expenses.

https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew/data

Time estimation: download for 100 seconds, 57.1MB, 194455 rows. So the time takes to download whole dataset should be about 16 hours. Size of whole data set should be about 33GB.

df = read.csv("Taxi_Trips.csv")
df

Q: After you have data Make table with day of the week in the column name, Length of a trip in vertical (you will have to cut it), and average tip in cells

  • Clean variable “Tips”
df$Tips <- str_replace_all(df$Tips, "\\$", "") %>%
  as.numeric()
  • Get three variables
table_tips <- df %>% 
  select(Trip.Start.Timestamp, Trip.End.Timestamp, Tips) %>%
  mutate(Start_date = mdy_hms(Trip.Start.Timestamp))  %>%
  mutate(wkday = wday(Start_date, label = TRUE)) %>%
  separate(Start_date, c("st_date","st_time"), sep = " ") %>%
  separate(st_time, c("st_hr","st_min","st_sc"), sep = ":") %>%
  mutate(End_date = mdy_hms(Trip.End.Timestamp))  %>%
  separate(End_date, c("ed_date","ed_time"), sep = " ") %>%
  separate(ed_time, c("ed_hr","ed_min","ed_sc"), sep = ":") %>%
  mutate(hr = as.numeric(ed_hr) - as.numeric(st_hr)) %>%
  mutate(min = as.numeric(ed_min) - as.numeric(st_min)) %>%
  filter(hr>0) %>%
  mutate(Duration = hr*60 + min) %>%
  select(wkday,Duration,Tips) 
  table_tips 
table_tips_wkday <- table_tips %>%
  group_by(wkday,Duration) %>%
  summarise(avg_tip = mean(Tips, na.rm = T)) %>%
  spread(wkday,avg_tip)
table_tips_wkday

The ones with tip = 0 influences the result. Not sure if they are correct data.

table_tips_cuttime <- table_tips %>%
  group_by(wkday,Duration_interval = cut_width(Duration, 120, boundary = 0)) %>%
  summarise(avg_tip = mean(Tips)) %>%
  spread(wkday,avg_tip)
table_tips_cuttime
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKMS4gV2hpY2ggZGF0YSBiYXNlcyBkcGx5ciBjYW4gd29yayB3aXRoPwpGaXZlIGNvbW1tb24gb25lczogUk15U1FMLCBSUG9zdGdyZVNRTCwgU1FMaXRlIGRhdGFiYXNlLCBvZGJjIGFuZCBiaWdycXVlcnkuCgoyLiBXaGF0IGlzIGJpZ3JxdWVyeSBhbmQgaG93IHRvIHdvcmsgd2l0aCBpdAoKYGBge3J9CiNsaWJyYXJ5KGJpZ3JxdWVyeSkKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShsdWJyaWRhdGUpCmxpYnJhcnkoc3RyaW5ncikKYGBgCgojIyBDaGljYWdvIHRheGkgZGF0YQojIyMgRGF0YSBkZXNjcmlwdGlvbjogVGhpcyBpcyBkYXRhIGZyb20gdGF4aSB0cmlwcyByZXBvcnRlZCB0byB0aGUgY2l0eSBvZiBDaGljYWdvLiBUaGVyZSBhcmUgdG90YWwgMTEyODYwMDU0IG9ic2VydmF0aW9ucyBhbmQgMjMgdmFyaWFibGVzIGNvbnRhaW5pbmcgaW5mb3JtYXRpb24gYWJvdXQgdGhlIHRheGksIHRpbWUsIGxvY2F0aW9uIGFuZCBleHBlbnNlcy4KaHR0cHM6Ly9kYXRhLmNpdHlvZmNoaWNhZ28ub3JnL1RyYW5zcG9ydGF0aW9uL1RheGktVHJpcHMvd3J2ei1wc2V3L2RhdGEKCiMjIyBUaW1lIGVzdGltYXRpb246IGRvd25sb2FkIGZvciAxMDAgc2Vjb25kcywgNTcuMU1CLCAxOTQ0NTUgcm93cy4gU28gdGhlIHRpbWUgdGFrZXMgdG8gZG93bmxvYWQgd2hvbGUgZGF0YXNldCBzaG91bGQgYmUgYWJvdXQgMTYgaG91cnMuIFNpemUgb2Ygd2hvbGUgZGF0YSBzZXQgc2hvdWxkIGJlIGFib3V0IDMzR0IuCgpgYGB7cn0KZGYgPSByZWFkLmNzdigiVGF4aV9Ucmlwcy5jc3YiKQpkZgpgYGAKClE6IEFmdGVyIHlvdSBoYXZlIGRhdGEgTWFrZSB0YWJsZSB3aXRoIGRheSBvZiB0aGUgd2VlayBpbiB0aGUgY29sdW1uIG5hbWUsIExlbmd0aCBvZiBhIHRyaXAgaW4gdmVydGljYWwgKHlvdSB3aWxsIGhhdmUgdG8gY3V0IGl0KSwgYW5kIGF2ZXJhZ2UgdGlwIGluIGNlbGxzCgotIENsZWFuIHZhcmlhYmxlICJUaXBzIgpgYGB7cn0KZGYkVGlwcyA8LSBzdHJfcmVwbGFjZV9hbGwoZGYkVGlwcywgIlxcJCIsICIiKSAlPiUKICBhcy5udW1lcmljKCkKYGBgCgotIEdldCB0aHJlZSB2YXJpYWJsZXMKYGBge3J9CnRhYmxlX3RpcHMgPC0gZGYgJT4lIAogIHNlbGVjdChUcmlwLlN0YXJ0LlRpbWVzdGFtcCwgVHJpcC5FbmQuVGltZXN0YW1wLCBUaXBzKSAlPiUKICBtdXRhdGUoU3RhcnRfZGF0ZSA9IG1keV9obXMoVHJpcC5TdGFydC5UaW1lc3RhbXApKSAgJT4lCiAgbXV0YXRlKHdrZGF5ID0gd2RheShTdGFydF9kYXRlLCBsYWJlbCA9IFRSVUUpKSAlPiUKICBzZXBhcmF0ZShTdGFydF9kYXRlLCBjKCJzdF9kYXRlIiwic3RfdGltZSIpLCBzZXAgPSAiICIpICU+JQogIHNlcGFyYXRlKHN0X3RpbWUsIGMoInN0X2hyIiwic3RfbWluIiwic3Rfc2MiKSwgc2VwID0gIjoiKSAlPiUKICBtdXRhdGUoRW5kX2RhdGUgPSBtZHlfaG1zKFRyaXAuRW5kLlRpbWVzdGFtcCkpICAlPiUKICBzZXBhcmF0ZShFbmRfZGF0ZSwgYygiZWRfZGF0ZSIsImVkX3RpbWUiKSwgc2VwID0gIiAiKSAlPiUKICBzZXBhcmF0ZShlZF90aW1lLCBjKCJlZF9ociIsImVkX21pbiIsImVkX3NjIiksIHNlcCA9ICI6IikgJT4lCiAgbXV0YXRlKGhyID0gYXMubnVtZXJpYyhlZF9ocikgLSBhcy5udW1lcmljKHN0X2hyKSkgJT4lCiAgbXV0YXRlKG1pbiA9IGFzLm51bWVyaWMoZWRfbWluKSAtIGFzLm51bWVyaWMoc3RfbWluKSkgJT4lCiAgZmlsdGVyKGhyPjApICU+JQogIG11dGF0ZShEdXJhdGlvbiA9IGhyKjYwICsgbWluKSAlPiUKICBzZWxlY3Qod2tkYXksRHVyYXRpb24sVGlwcykgCiAgdGFibGVfdGlwcyAKYGBgCmBgYHtyfQp0YWJsZV90aXBzX3drZGF5IDwtIHRhYmxlX3RpcHMgJT4lCiAgZ3JvdXBfYnkod2tkYXksRHVyYXRpb24pICU+JQogIHN1bW1hcmlzZShhdmdfdGlwID0gbWVhbihUaXBzLCBuYS5ybSA9IFQpKSAlPiUKICBzcHJlYWQod2tkYXksYXZnX3RpcCkKdGFibGVfdGlwc193a2RheQpgYGAKVGhlIG9uZXMgd2l0aCB0aXAgPSAwIGluZmx1ZW5jZXMgdGhlIHJlc3VsdC4gTm90IHN1cmUgaWYgdGhleSBhcmUgY29ycmVjdCBkYXRhLgoKYGBge3J9CnRhYmxlX3RpcHNfY3V0dGltZSA8LSB0YWJsZV90aXBzICU+JQogIGdyb3VwX2J5KHdrZGF5LER1cmF0aW9uX2ludGVydmFsID0gY3V0X3dpZHRoKER1cmF0aW9uLCAxMjAsIGJvdW5kYXJ5ID0gMCkpICU+JQogIHN1bW1hcmlzZShhdmdfdGlwID0gbWVhbihUaXBzKSkgJT4lCiAgc3ByZWFkKHdrZGF5LGF2Z190aXApCnRhYmxlX3RpcHNfY3V0dGltZQpgYGAK